# Always print this out before your assignment
sessionInfo()
R version 4.1.1 (2021-08-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19043)
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252 LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C LC_TIME=English_United States.1252
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] plotROC_2.2.1 glmnetUtils_1.1.8 glmnet_4.1-3 Matrix_1.3-4
[5] randomForestExplainer_0.10.1 randomForest_4.6-14 urbnthemes_0.0.2 tidycensus_1.1
[9] usdata_0.2.0 usmap_0.5.2.9999 urbnmapr_0.0.0.9002 corrplot_0.92
[13] gdata_2.18.0 scales_1.1.1.9000 tidyquant_1.0.3 quantmod_0.4.18
[17] TTR_0.24.2 PerformanceAnalytics_2.0.4 xts_0.12.1 zoo_1.8-9
[21] plotly_4.10.0 viridis_0.6.2 viridisLite_0.4.0 pastecs_1.3.21
[25] kableExtra_1.3.4 lubridate_1.8.0 rsample_0.1.1 ggthemes_4.2.4
[29] ggrepel_0.9.1 here_1.0.1 fs_1.5.1 forcats_0.5.1
[33] stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4 readr_2.1.1
[37] tidyr_1.1.4 tibble_3.1.6 ggplot2_3.3.5 tidyverse_1.3.1
[41] knitr_1.36
loaded via a namespace (and not attached):
[1] readxl_1.3.1 uuid_1.0-3 backports_1.4.0 systemfonts_1.0.3 plyr_1.8.6 lazyeval_0.2.2
[7] splines_4.1.1 sp_1.4-6 listenv_0.8.0 digest_0.6.29 foreach_1.5.1 htmltools_0.5.2
[13] fansi_0.5.0 magrittr_2.0.1 tzdb_0.2.0 globals_0.14.0 modelr_0.1.8 extrafont_0.17
[19] extrafontdb_1.0 svglite_2.0.0 colorspace_2.0-2 rvest_1.0.2 rappdirs_0.3.3 haven_2.4.3
[25] xfun_0.28 rgdal_1.5-27 crayon_1.4.2 jsonlite_1.7.2 survival_3.2-11 tigris_1.5
[31] iterators_1.0.13 glue_1.5.1 gtable_0.3.0 webshot_0.5.2 Quandl_2.11.0 Rttf2pt1_1.3.9
[37] shape_1.4.6 DBI_1.1.1 GGally_2.1.2 Rcpp_1.0.7 units_0.7-2 foreign_0.8-81
[43] proxy_0.4-26 DT_0.20 htmlwidgets_1.5.4 httr_1.4.2 RColorBrewer_1.1-2 ellipsis_0.3.2
[49] farver_2.1.0 pkgconfig_2.0.3 reshape_0.8.8 sass_0.4.0 dbplyr_2.1.1 utf8_1.2.2
[55] labeling_0.4.2 tidyselect_1.1.1 rlang_0.4.12 munsell_0.5.0 cellranger_1.1.0 tools_4.1.1
[61] cli_3.1.0 generics_0.1.1 broom_0.7.10 evaluate_0.14 fastmap_1.1.0 yaml_2.2.1
[67] future_1.23.0 xml2_1.3.3 compiler_4.1.1 rstudioapi_0.13 curl_4.3.2 e1071_1.7-9
[73] reprex_2.0.1 bslib_0.3.1 stringi_1.7.6 lattice_0.20-44 classInt_0.4-3 vctrs_0.3.8
[79] pillar_1.6.4 lifecycle_1.0.1 furrr_0.2.3 jquerylib_0.1.4 data.table_1.14.2 maptools_1.1-2
[85] R6_2.5.1 KernSmooth_2.23-20 gridExtra_2.3 parallelly_1.29.0 codetools_0.2-18 boot_1.3-28
[91] gtools_3.9.2 assertthat_0.2.1 rprojroot_2.0.2 withr_2.4.3 parallel_4.1.1 hms_1.1.1
[97] quadprog_1.5-8 grid_4.1.1 class_7.3-19 rmarkdown_2.11 sf_1.0-4
getwd()
[1] "C:/Users/Willis Admin/Google Drive/EMBA/8 - BUS_696/final_project/BROCODE_Final_Project"
# load all your libraries in this chunk
library('tidyverse')
library("fs")
library('here')
library('dplyr')
library('tidyverse')
library('ggplot2')
library('ggrepel')
library('ggthemes')
library('forcats')
library('rsample')
library('lubridate')
library('ggthemes')
library('kableExtra')
library('pastecs')
library('viridis')
library('plotly')
library('tidyquant')
library('scales')
library("gdata")
library("corrplot")
library("urbnmapr")
library("usmap")
library("usdata")
library("tidycensus")
library("urbnthemes")
library("randomForest")
library("randomForestExplainer")
library('glmnet')
library('glmnetUtils')
library('plotROC')
#install.packages("remotes")
#remotes::install_github("UrbanInstitute/urbnthemes", build_vignettes = TRUE)
# note, do not run install.packages() inside a code chunk. install them in the console outside of a code chunk.
1a) Loading data
#Reading the data in and doing minor initial cleaning in the function call
#Reproducible data analysis should avoid all automatic string to factor conversions.
#strip.white removes white space
#na.strings is a substitution so all that have "" will = na
data <- read.csv(here::here("final_project", "donor_data.csv"),
stringsAsFactors = FALSE,
strip.white = TRUE,
na.strings = "")
1b) Fixing the wonky DOB & Data cleanup
#(Birthdate and Age, ID as a number)adding DOB (Age/Spouse Age) in years columns and adding two fields for assignment and number of children and number of degrees
dataclean <- data %>%
mutate(Birthdate = ifelse(Birthdate == "0001-01-01", NA, Birthdate)) %>%
mutate(Birthdate = mdy(Birthdate)) %>%
mutate(Age = as.numeric(floor(interval(start= Birthdate, end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(Spouse.Birthdate = ifelse(Spouse.Birthdate == "0001-01-01", NA, Spouse.Birthdate)) %>%
mutate(Spouse.Birthdate = mdy(Spouse.Birthdate)) %>%
mutate(Spouse.Age = as.numeric(floor(interval(start= Spouse.Birthdate,
end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(ID = as.numeric(ID)) %>%
mutate(Assignment_flag = ifelse(is.na(Assignment.Number), 0,1)) %>%
mutate( No_of_Children = ifelse(is.na(Child.1.ID),0,
ifelse(is.na(Child.2.ID),1,2)))%>%
mutate(ID = as.numeric(ID)) %>%
mutate( nmb_degree = ifelse(is.na(Degree.Type.1),0,
ifelse(is.na(Degree.Type.2),1,2))) %>%
#conferral dates
mutate(Conferral.Date.1 = ifelse(Conferral.Date.1 == "0001-01-01", NA, Conferral.Date.1)) %>%
mutate(Conferral.Date.1 = mdy(Conferral.Date.1)) %>%
mutate(Conferral.Date.1.Age = as.numeric(floor(interval(start= Conferral.Date.1, end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(Conferral.Date.2 = ifelse(Conferral.Date.2 == "0001-01-01", NA, Conferral.Date.2)) %>%
mutate(Conferral.Date.2 = mdy(Conferral.Date.2)) %>%
mutate(Conferral.Date.2.Age = as.numeric(floor(interval(start= Conferral.Date.2, end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(Last.Contact.By.Anyone = ifelse(Last.Contact.By.Anyone == "0001-01-01", NA, Last.Contact.By.Anyone)) %>%
mutate(Last.Contact.By.Anyone = mdy(Last.Contact.By.Anyone)) %>%
mutate(Last.Contact.Age = as.numeric(floor(interval(start= Last.Contact.By.Anyone, end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(HH.First.Gift.Date = ifelse(HH.First.Gift.Date == "0001-01-01", NA, HH.First.Gift.Date)) %>%
mutate(HH.First.Gift.Date = mdy(HH.First.Gift.Date)) %>%
mutate(HH.First.Gift.Age = as.numeric(floor(interval(start= HH.First.Gift.Date, end=Sys.Date())/duration(n=1, unit="years")))) %>%
#major gift
mutate(major_gifter = ifelse(Lifetime.Giving > 50000, 1,0) %>% factor(., levels = c("0","1"))) %>%
#splitting up the age into ranges and creating category for easy visualization
mutate(age_range =
ifelse(Age %in% 10:19, "10 < 20 years old",
ifelse(Age %in% 20:29, "20 < 30 years old",
ifelse(Age %in% 30:39, "30 < 40 years old",
ifelse(Age %in% 40:49, "40 < 50 years old",
ifelse(Age %in% 50:59, "50 < 60 years old",
ifelse(Age %in% 60:69, "60 < 70 years old",
ifelse(Age %in% 70:79, "70 < 80 years old",
ifelse(Age %in% 80:89, "80 < 90 years old",
ifelse(Age %in% 90:120, "90+ years old",
NA)))))))))) %>%
#creating a region column using the county data and the OMB MSA (Metropolitan Statistical Area) definitions
ifelse(County == "San Luis Obispo" & State == "CA", "So Cal",
ifelse(County == "San Bernardino" & State == "CA", "So Cal",
ifelse(County == "Santa Barbara" & State == "CA", "So Cal",
ifelse(County == "Ventura" & State == "CA", "So Cal",
ifelse(County == "Los Angeles" & State == "CA", "So Cal",
ifelse(County == "Orange" & State == "CA", "So Cal",
ifelse(County == "Riverside" & State == "CA", "So Cal",
ifelse(County == "San Diego" & State == "CA", "So Cal",
ifelse(County == "Imperial" & State == "CA", "So Cal",
ifelse(County == "King" & State == "WA", "Seattle",
ifelse(County == "Snohomish" & State == "WA", "Seattle",
ifelse(County == "Pierce" & State == "WA", "Seattle",
ifelse(County == "Clackamas" & State == "OR", "Portland",
ifelse(County == "Columbia" & State == "OR", "Portland",
ifelse(County == "Multnomah" & State == "OR", "Portland",
ifelse(County == "Washington" & State == "OR", "Portland",
ifelse(County == "Yamhill" & State == "OR", "Portland",
ifelse(County == "Clark" & State == "WA", "Portland",
ifelse(County == "Skamania" & State == "WA", "Portland",
ifelse(County == "Denver" & State == "CO", "Denver",
ifelse(County == "Arapahoe" & State == "CO", "Denver",
ifelse(County == "Jefferson" & State == "CO", "Denver",
ifelse(County == "Adams" & State == "CO", "Denver",
ifelse(County == "Douglas" & State == "CO", "Denver",
ifelse(County == "Broomfield" & State == "CO", "Denver",
ifelse(County == "Elbert" & State == "CO", "Denver",
ifelse(County == "Park" & State == "CO", "Denver",
ifelse(County == "Clear Creek" & State == "CO", "Denver",
ifelse(County == "Alameda" & State == "CA", "Bay Area",
ifelse(County == "Contra Costa" & State == "CA", "Bay Area",
ifelse(County == "Marin" & State == "CA", "Bay Area",
ifelse(County == "Monterey" & State == "CA", "Bay Area",
ifelse(County == "Napa" & State == "CA", "Bay Area",
ifelse(County == "San Benito" & State == "CA", "Bay Area",
ifelse(County == "San Francisco" & State == "CA", "Bay Area",
ifelse(County == "San Mateo" & State == "CA", "Bay Area",
ifelse(County == "Santa Clara" & State == "CA", "Bay Area",
ifelse(County == "Santa Cruz" & State == "CA", "Bay Area",
ifelse(County == "Solano" & State == "CA", "Bay Area",
ifelse(County == "Sonoma" & State == "CA", "Bay Area",
NA)))))))))))))))))))))))))))))))))))))))))) %>%
mutate(region =
ifelse(County == "Kings" & State == "NY", "New York",
ifelse(County == "Queens" & State == "NY", "New York",
ifelse(County == "New York" & State == "NY", "New York",
ifelse(County == "Bronx" & State == "NY", "New York",
ifelse(County == "Richmond" & State == "NY", "New York",
ifelse(County == "Westchester" & State == "NY", "New York",
ifelse(County == "Bergen" & State == "NY", "New York",
ifelse(County == "Hudson" & State == "NY", "New York",
ifelse(County == "Passaic" & State == "NY", "New York",
ifelse(County == "Putnam" & State == "NY", "New York",
ifelse(County == "Rockland" & State == "NY", "New York",
ifelse(County == "Suffolk" & State == "NY", "New York",
ifelse(County == "Nassau" & State == "NY", "New York",
ifelse(County == "Middlesex" & State == "NJ", "New York",
ifelse(County == "Monmouth" & State == "NJ", "New York",
ifelse(County == "Ocean" & State == "NJ", "New York",
ifelse(County == "Somerset" & State == "NJ", "New York",
ifelse(County == "Essex" & State == "NJ", "New York",
ifelse(County == "Union" & State == "NJ", "New York",
ifelse(County == "Morris" & State == "NJ", "New York",
ifelse(County == "Sussex" & State == "NJ", "New York",
ifelse(County == "Hunterdon" & State == "NJ", "New York",
ifelse(County == "Pike" & State == "NJ", "New York",
region)))))))))))))))))))))))) %>%
# code nor cal region as all others in CA not already defined
mutate(region =
ifelse(State == "CA" & is.na(region) == TRUE, "Nor Cal", region))
#Initial Removal of Columns that provide no benefit
dataclean <- subset(dataclean,select = -c(Assignment.Number
,Assignment.has.Historical.Mngr
,Suffix
,Assignment.Date
,Assignment.Manager
,Assignment.Role
,Assignment.Title
,Assignment.Status
,Strategy
,Progress.Level
,Assignment.Group
,Assignment.Category
,Funding.Method
,Expected.Book.Date
,Qualification.Amount
,Expected.Book.Amount
,Expected.Book.Date
,Hard.Gift.Total
,Soft.Credit.Total
,Total.Assignment.Gifts
,No.of.Pledges
,Proposal..
,Proposal.Notes
,HH.Life.Spouse.Credit
,Last.Contact.By.Manager
,X..of.Contacts.By.Manager
,DonorSearch.Range
,iWave.Range
,WealthEngine.Range
,Philanthropic.Commitments
))
#cleaning up zip codes removing -4 after
dataclean$Zip <- gsub(dataclean$Zip, pattern="-.*", replacement = "")
#adding zip code data and column
zip <- read.csv(here::here("final_project", "Salary_Zipcode.csv"),
stringsAsFactors = FALSE,
strip.white = TRUE,
na.strings = "")
dataclean <-dataclean %>%
mutate(zipcode_slry = VLOOKUP(Zip, zip, NAME, S1902_C03_002E)) %>%
#slry range
mutate(zipslry_range =
ifelse(zipcode_slry %in% 10000:89999, "90K-99K",
ifelse(zipcode_slry %in% 90000:99999, "90K-99K",
ifelse(zipcode_slry %in% 100000:149999, "100K-149K",
ifelse(zipcode_slry %in% 150000:199999, "150K-199K",
ifelse(zipcode_slry %in% 200000:249999, "200K-249K",
ifelse(zipcode_slry %in% 250000:299999, "250K-299K",
ifelse(zipcode_slry %in% 300000:349999, "300K-349K",
ifelse(zipcode_slry %in% 350000:399999, "350K-399K",
ifelse(zipcode_slry %in% 400000:499999, "400K-499K",
ifelse(zipcode_slry %in% 500000:999999, "500K-999K",
NA)))))))))))
sum(is.na(dataclean$zipcode_slry))
[1] 62347
#converting married Y and N to 1 and 0
dataclean <- dataclean %>%
mutate(Married_simple = ifelse(Married == "N",0,1))
dataclean <- dataclean %>%
mutate(hh.lifetime.giving_fct = as.factor(HH.Lifetime.Giving)) %>%
mutate(HH.Lifetime.Giving.Plus = log(HH.Lifetime.Giving + 1)) %>%
mutate(Lifetime.Giving.Plus = log(Lifetime.Giving + 1)) %>%
mutate(average_yearly_donation = (HH.Total.Gifts.FY20.21+HH.Total.Gifts.FY19.20+HH.Total.Gifts.FY18.19+HH.Total.Gifts.FY17.18+HH.Total.Gifts.FY16.17)/5)
#adding scholarship data (y/n)
schlr <- read.csv(here::here("final_project", "scholarship.csv"),
stringsAsFactors = FALSE,
strip.white = TRUE,
na.strings = "")
#adding scholarship column
dataclean <-dataclean %>%
mutate(scholarship = VLOOKUP(ID, schlr, ID, SCHOLARSHIP))
#replacing NA with 0
dataclean$scholarship <- replace_na(dataclean$scholarship,'0')
#replacing Y with 1
dataclean$scholarship<-ifelse(dataclean$scholarship=="Y",1,0)
#checking how many are N
table(dataclean$scholarship)
0 1
295264 27962
#checking and deleting scholarship column
class(dataclean$schlr_fct)
[1] "NULL"
dataclean = subset(dataclean, select = -c(scholarship))
#checking for duplicates N >1 indicates a records values are in the file twice
dataclean %>% group_by(ID) %>% count() %>% arrange(desc(n))
#removing duplicated records
dataclean <- unique(dataclean)
#Verifying n = 1 no ID with multiple records cleaned of dupes
dataclean %>% group_by(ID) %>% count() %>% arrange(desc(n))
NA
1d Creating many many factor variables
dataclean <-
dataclean %>%
#SEX
mutate(sex_fct =
fct_explicit_na(Sex),
sex_simple =
fct_lump_n(Sex, n = 4),
#MARRIED
married_fct =
fct_explicit_na(Married),
#DONOR SEGMENT
donorseg_fct =
fct_explicit_na(Donor.Segment),
donorseg_simple =
fct_lump_n(Donor.Segment, n = 4),
#CONTACT RULE
contact_fct =
fct_explicit_na(Contact.Rules),
contact_simple =
fct_lump_n(Contact.Rules, n = 4),
#SPOUSE MAIL
spomail_fct =
fct_explicit_na(Spouse.Mail.Rules),
spomail_simple =
fct_lump_n(Spouse.Mail.Rules, n = 4),
#JOB TITLE
jobtitle_fct =
fct_explicit_na(Job.Title),
jobtitle_simple =
fct_lump_n(Job.Title, n = 5),
#DEGREE TYPE 1
deg1_fct =
fct_explicit_na(Degree.Type.1),
deg1_simple =
fct_lump_n(Degree.Type.1, n = 5),
#DEGREE TYPE 2
deg2_fct =
fct_explicit_na(Degree.Type.2),
deg2_simple =
fct_lump_n(Degree.Type.2, n = 5),
#MAJOR 1
maj1_fct =
fct_explicit_na(Major.1),
maj1_simple =
fct_lump_n(Major.1, n = 5),
#MAJOR 2
maj2_fct =
fct_explicit_na(Major.2),
maj2_simple =
fct_lump_n(Major.2, n = 5),
#MINOR 1
min1_fct =
fct_explicit_na(Minor.1),
min1_simple =
fct_lump_n(Minor.1, n = 5),
#MINOR 2
min2_fct =
fct_explicit_na(Minor.2),
min2_simple =
fct_lump_n(Minor.2, n = 5),
#SCHOOL 1
school1_fct =
fct_explicit_na(School.1),
school1_simple =
fct_lump_n(School.1, n = 5),
#SCHOOL 2
school2_fct =
fct_explicit_na(School.2),
school2_simple =
fct_lump_n(School.2, n = 5),
#INSTITUTION TYPE
insttype_fct =
fct_explicit_na(Institution.Type),
insttype_simple =
fct_lump_n(Institution.Type, n = 5),
#EXTRACURRICULAR
extra_fct =
fct_explicit_na(Extracurricular),
extra_simple =
fct_lump_n(Extracurricular, n = 5),
#HH FIRST GIFT FUND
hhfirstgift_fct =
fct_explicit_na(HH.First.Gift.Fund),
hhfirstgift_simple =
fct_lump_n(HH.First.Gift.Fund, n = 5),
#CHILD 1 ENROLL STATUS
ch1_enroll_fct =
fct_explicit_na(Child.1.Enroll.Status),
ch1_enroll_simple =
fct_lump_n(Child.1.Enroll.Status, n = 4),
#CHILD 1 MAJOR
ch1_maj_fct =
fct_explicit_na(Child.1.Major),
ch1_maj_simple =
fct_lump_n(Child.1.Major, n = 4),
#CHILD 1 MINOR
ch1_min_fct =
fct_explicit_na(Child.1.Minor),
ch1_min_simple =
fct_lump_n(Child.1.Minor, n = 4),
#CHILD 1 SCHOOL
ch1_school_fct =
fct_explicit_na(Child.1.School),
ch1_school_simple =
fct_lump_n(Child.1.School, n = 4),
#CHILD 1 FEEDER
ch1_feeder_fct =
fct_explicit_na(Child.1.Feeder.School),
ch1_feeder_simple =
fct_lump_n(Child.1.Feeder.School, n = 4),
#CHILD 2 ENROLL STATUS
ch1_enroll_fct =
fct_explicit_na(Child.2.Enroll.Status),
ch2_enroll_simple =
fct_lump_n(Child.2.Enroll.Status, n = 4),
ch2_maj_fct =
ch2_maj_simple =
fct_lump_n(Child.2.Major, n = 4),
#CHILD 2 MINOR
ch2_min_fct =
fct_explicit_na(Child.2.Minor),
ch2_min_simple =
fct_lump_n(Child.2.Minor, n = 4),
#CHILD 2 SCHOOL
ch2_school_fct =
fct_explicit_na(Child.2.School),
ch2_school_simple =
fct_lump_n(Child.2.School, n = 4),
#CHILD 2 FEEDER
ch2_feeder_fct =
fct_explicit_na(Child.2.Feeder.School),
ch2_feeder_simple =
fct_lump_n(Child.2.Feeder.School, n = 4),
#Region
region_fct =
fct_explicit_na(region),
region_simple =
fct_lump_n(region, n = 5),
#Age
age_range_fct =
fct_explicit_na(age_range),
)
#checking to see if its a factor
#class(dataclean$sex_fct)
#class(dataclean$donorseg_fct)
#class(dataclean$contact_fct)
#class(dataclean$spomail_fct)
#checking levels
#levels(dataclean$sex_simple)
#levels(dataclean$donorseg_simple)
#levels(dataclean$contact_simple)
#levels(dataclean$spomail_simple)
#levels(dataclean$hhfirstgift_simple)
#creating a table against Sex column
#table(dataclean$sex_fct, dataclean$sex_simple)
Region Analysis
#grouping by region and analyzing
dataclean %>%
filter(HH.Lifetime.Giving != 0)%>% #filtering out the 0s per our presentation and recommendations
group_by(region) %>%
summarise(Count = length(region),
median_total_giv = median(HH.Lifetime.Giving)) %>%
arrange(-Count) %>%
filter(Count >= 100) %>%
mutate(median_total_giv = dollar(median_total_giv)) %>%
kable(col.names = c("Region", "Count", "Median HH Lifetime Giving"), align=rep('c', 3)) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F)
| Region | Count | Median HH Lifetime Giving |
|---|---|---|
| So Cal | 49438 | $100 |
| NA | 28462 | $77 |
| Bay Area | 6046 | $100 |
| Nor Cal | 2892 | $100 |
| Seattle | 1753 | $100 |
| Portland | 902 | $100 |
| Denver | 802 | $75 |
| New York | 731 | $170 |
NA
DonorSegment Analysis
#grouping by donorsegment and analyzing
dataclean %>%
filter(HH.Lifetime.Giving != 0)%>% #filtering 0s and adding median per
group_by(Donor.Segment) %>%
summarise(Count = length(Donor.Segment),
median_total_giv = median(HH.Lifetime.Giving)) %>%
arrange(-Count) %>%
filter(Count >= 100) %>%
#added scales package to have the values show in dollar
mutate(median_total_giv = (dollar(median_total_giv))) %>%
kable(col.names = c("Donor Segment", "Count", "Median HH Lifetime Giving"), align=rep('c', 3)) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F)
| Donor Segment | Count | Median HH Lifetime Giving |
|---|---|---|
| Lost Donor | 69718 | $85.00 |
| Lapsed Donor | 11193 | $130.83 |
| Current Donor | 5603 | $637.50 |
| Lapsing Donor | 3862 | $250.00 |
| At-Risk Donor | 650 | $750.00 |
NA
NA
First gift size analysis
dataclean %>%
filter(HH.Lifetime.Giving != 0)
aq <- quantile(dataclean$HH.First.Gift.Amount, probs = c(.25,.50,.75,.9,.99), na.rm = TRUE)
aq <- as.data.frame(aq)
aq$aq <- dollar(aq$aq)
aq %>%
kable(col.names = "Quantile") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F)
| Quantile | |
|---|---|
| 25% | $0.00 |
| 50% | $0.00 |
| 75% | $0.00 |
| 90% | $40.00 |
| 99% | $1,910.06 |
NA
NA
Consecutive giving
#consecutive years of giving
dataclean %>%
filter(Max.Consec.Fiscal.Years > 0) %>%
ggplot(aes(Max.Consec.Fiscal.Years)) + geom_histogram(fill = "#002845", bins = 20) +
theme_economist_white() +
ggtitle("Consecutive Years of Giving Distribution") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,120,2)) +
scale_y_continuous(breaks = seq(0,10000000,5000))
NA
NA
NA
Lifetime giving based on number of children
dataclean %>%
filter(HH.Lifetime.Giving <= 10000) %>%
filter(HH.Lifetime.Giving > 0) %>%
mutate(`No_of_Children` = as.factor(`No_of_Children`)) %>%
ggplot(aes(HH.Lifetime.Giving, fill = `No_of_Children`)) + geom_histogram(bins = 30) + theme_economist_white() +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,100000,1000)) +
scale_y_continuous(breaks = seq(0,100000000,5000)) +
ggtitle("Giving distribution and number of children")+
scale_fill_manual(values=c("#002845", "#00cfcc", "#ff9973"))
NA
NA
NA
Mean, Median, and Count of Giving in Age Ranges
age_range_giving <- dataclean %>%
group_by(age_range) %>%
summarise(avg_giving = mean(HH.Lifetime.Giving, na.rm = TRUE),
med_giving = median(HH.Lifetime.Giving, na.rm = TRUE),
amount_of_people_in_age_range = n())
glimpse(age_range_giving)
Rows: 10
Columns: 4
$ age_range <chr> "10 < 20 years old", "20 < 30 years old", "30 < 40 years old", "40 < 50 years old~
$ avg_giving <dbl> 0.4501597, 28.2527107, 386.5961310, 810.4033032, 2778.4964811, 5400.1138522, 1156~
$ med_giving <dbl> 0, 0, 0, 0, 0, 0, 0, 10, 15, 0
$ amount_of_people_in_age_range <int> 3944, 24569, 21021, 16829, 20742, 18231, 12204, 5951, 6638, 192871
Plotting average giving by age range
age_range_giving <-
age_range_giving %>%
mutate(age_range = factor(age_range))
ggplot(age_range_giving, aes(age_range, avg_giving)) +
geom_bar(stat = "identity")+
theme(axis.text.x = element_text(angle=45,
hjust=1)) + labs(x = "Age Range", y = "Average Giving") +
ggtitle("Average Giving Compared Across Age Ranges")
NA
NA
Count of donors based on age range (another way to look at it)
ggplot(dataclean,
aes(age_range)) +
geom_bar() +
theme(axis.text.x = element_text(angle=45,
hjust=1)) +
labs(title = "Count of Age Ranges", x = "", y = "")
NA
NA
Boxplot of the Age Ranges Against the Lifetime Giving Amounts with a log scale applied - the reason we applied log scale is to resolve issues with visualizations that skew towards large values in our dataset.
THIS CHUCNK WOULD NOT RUN ON FINAL COMPILATION. IT CLEARLY RAN DURING THE PREP OF OUR POWERPOINT PRESENTATION
2d) Splitting by age and gender
#creating boxplots
dataclean %>%
filter(Age < 100) %>% #removing the weird outliers that are over 100
filter(Sex %in% c("M", "F")) %>%
ggplot(aes(Sex, Age)) +
geom_boxplot() +
theme_economist() +
ggtitle("Ages of Donors Based on Gender") +
xlab(NULL) + ylab(NULL)
NA
NA
Giving by gender
#remove NAs U X
dataclean2 <- dataclean %>%
filter(Sex %in% c("M", "F"))
q <- ggplot(dataclean2)
q + stat_summary_bin(
aes(y = HH.Lifetime.Giving, x = Sex),
fun.y = "mean", geom = "bar")
summary(dataclean$sex_simple)
F M U X NA's
120781 108190 3683 7 90339
Mean age by gender
#breakdown of sexs
tally(group_by(dataclean, Sex))
summarize(group_by(dataclean, Sex),
avg_giving = mean(HH.Lifetime.Giving, na.rm = TRUE),
avg_age = mean(Age, na.rm = TRUE),
med_age = median(Age, na.rm = TRUE))
#grouping by sex and age range for slides
tally(group_by(dataclean, Sex, age_range))
NA
NA
NA
2e) Distribution of people in the states that they live.
dataclean %>%
mutate(State = ifelse(State == " ", "NA", State)) %>%
filter(State != "NA") %>%
group_by(State) %>%
summarise(Count = length(State)) %>%
filter(Count > 800) %>%
arrange(-Count) %>%
kable(col.names = c("Donor's State", "Count")) %>%
kable_styling(bootstrap_options = c("condensed"),
full_width = F)
| Donor's State | Count |
|---|---|
| CA | 176487 |
| WA | 7957 |
| TX | 7266 |
| NY | 5659 |
| CO | 5073 |
| AZ | 4925 |
| OR | 4612 |
| FL | 4111 |
| IL | 3681 |
| HI | 3394 |
| PA | 2904 |
| OH | 2754 |
| NV | 2715 |
| MI | 2523 |
| MA | 2473 |
| NJ | 2311 |
| VA | 2158 |
| NC | 2085 |
| GA | 2044 |
| MO | 1889 |
| MN | 1732 |
| MD | 1488 |
| TN | 1443 |
| IN | 1417 |
| CT | 1380 |
| WI | 1330 |
| UT | 1173 |
| OK | 1151 |
| AL | 1120 |
| LA | 1110 |
| ID | 1096 |
| SC | 1076 |
| KY | 1032 |
| KS | 1027 |
| NM | 981 |
| IA | 880 |
NA
NA
2f) Looking at all donors first gift amount. 75% made a first gift of <100.
no_non_donors <- dataclean %>%
filter(Lifetime.Giving != 0)
nd <- quantile(no_non_donors$HH.First.Gift.Amount, probs = c(.25,.50,.75,.9,.99), na.rm = TRUE)
nd <- as.data.frame(nd)
nd %>%
kable(col.names = "Quantile") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
| Quantile | |
|---|---|
| 25% | 3.8 |
| 50% | 25.0 |
| 75% | 100.0 |
| 90% | 500.0 |
| 99% | 15000.0 |
NA
NA
NA
NA
p <- dataclean %>%
ggplot(aes(Age)) + geom_histogram(bins=30, fill = "blue") + theme_economist_white() +
ggtitle("Overall Donor Age Distribution") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(5,100,by = 20)) +
scale_y_continuous(breaks = seq(20,100,by = 20)) + xlim(c(20,100))
Scale for 'x' is already present. Adding another scale for 'x', which will replace the existing scale.
ggplotly(p)
p
ggplot(data = dataclean, aes(x = Age)) + geom_histogram(fill ="blue")+ xlim(c(20,100))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
NA
NA
NA
Another Histogram
dataclean %>%
filter(Age >= 10) %>%
filter(Age <= 90) %>%
ggplot(aes(Age)) + geom_histogram(fill = "#002845", bins = 20) + theme_economist_white() +
ggtitle("Overall Donor Age Distribution") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,120,5)) +
scale_y_continuous(breaks = seq(0,10000000,2000))
Age distribution by gender
#Age Gender filtered out below 15 and above 90 - also removed U X the weird values
dataclean %>%
filter(Age >= 15) %>%
filter(Age <= 90) %>%
mutate(Sex = as.factor(Sex)) %>%
filter(Sex != "U") %>%
filter(Sex != "X") %>%
ggplot(aes(Age, fill = Sex)) + geom_histogram(bins = 25) + theme_economist_white() +
ggtitle("Age Distribution by Gender") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,120,10)) +
scale_y_continuous(breaks = seq(0,50000,2000)) + scale_fill_manual(values=c("#ff9973", "#00cfcc"))
Donor age distribution by marital status
#Age Marital Status
dataclean %>%
filter(Age >= 20) %>%
filter(Age <= 85) %>%
ggplot(aes(Age, fill = Married)) + geom_histogram(bins = 25) + theme_economist_white() +
ggtitle("Overall Donor Age Distribution by Marital Status") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,120,5)) +
scale_y_continuous(breaks = seq(0,50000,2000)) + scale_fill_manual(values=c("#ff9973", "#00cfcc"))
Mapping
#add county after county name in order to use Urbnmapr
dataclean <- dataclean %>%
mutate(County =
ifelse(is.na(County) == TRUE, County, paste0(County, " County")))
dc_state_county <- dataclean %>%
mutate(polyname =
ifelse(is.na(County) == TRUE, County, paste0(State, ",",County)))
dc_state_county2 <- fips_codes %>%
mutate(polyname = paste0(state, ",",county)) %>%
mutate(county_fips = paste0(state_code,county_code))
#Bring in the FIPS data to our data set since UrbnMapr uses FIPS
countymap_data <- left_join(dc_state_county, dc_state_county2, by = "polyname")
spatial_data <- left_join(get_urbn_map(map = "counties", sf = TRUE),
countymap_data,
by = "county_fips")
old-style crs object detected; please recreate object with a recent sf::st_crs()
# plot maps
ggplot() +
geom_sf(spatial_data,
mapping = aes(fill = HH.Lifetime.Giving),
color = "#ffffff", size = 0.50) +
labs(fill = "Household Lifetime Giving")
county_sf <- get_urbn_map(map = "counties", sf = TRUE)
county_sf %>%
left_join(countymap_data, by = "county_fips") %>%
ggplot() +
geom_sf(mapping = aes(fill = HH.Lifetime.Giving),
color = "#ffffff", size = 0.25) +
theme_economist_white() +
scale_fill_gradient(low = ("lightblue"),high = ("blue"),labels = scales::dollar) +
labs(fill = "Household Lifetime Giving") +
coord_sf(datum = NA)
old-style crs object detected; please recreate object with a recent sf::st_crs()
spatial_data %>%
filter(state_name.x == "California") %>%
filter(HH.Lifetime.Giving != 0)%>%
filter(HH.Lifetime.Giving %in% (100:1500000)) %>%
ggplot() +
geom_sf(mapping = aes(fill = log(HH.Lifetime.Giving),na.rm=TRUE),
color = "#666666", size = 0.10) +
coord_sf(datum = NA) +
scale_fill_gradient(low = ("lightblue"),high = ("blue"),labels = scales::dollar) +
labs(fill = "Household Lifetime Giving")
spatial_data %>%
filter(state_name.x == "Florida") %>%
filter(HH.Lifetime.Giving != 0)%>%
filter(HH.Lifetime.Giving %in% (100:1500000)) %>%
ggplot() +
geom_sf(mapping = aes(fill = log(HH.Lifetime.Giving),na.rm=TRUE),
color = "#666666", size = 0.025) +
coord_sf(datum = NA) +
scale_fill_gradient(low = ("lightblue"),high = ("blue"),labels = scales::dollar) +
labs(fill = "Household Lifetime Giving")
spatial_data %>%
filter(HH.Lifetime.Giving %in% (100:1500000)) %>%
filter(HH.Lifetime.Giving > 0)%>%
ggplot() +
geom_sf(mapping = aes(fill = log(HH.Lifetime.Giving),na.rm=TRUE, show.legend = "polygon"),
color = "#666666", size = 0.10) +
coord_sf(datum = NA) +
scale_fill_gradient(low = ("lightblue"),high = ("blue"),labels = scales::dollar) +
labs(fill = "Household Lifetime Giving")
#cannot figure out how to exp the legend value after logging HH.Lifetime.Giving
Splitting data and creating a new set for easier analysis
data_split <- initial_split(dataclean, prop = 0.75)
data_train <- training(data_split)
data_test <- testing(data_split)
Linear Models
#These will focus on predicting whether a constituent is a donor or non-donor.
mod1lm <- lm( Lifetime.Giving ~ Married_simple,
data = data_train)
mod2lm <- lm( Total.Giving.Years ~ Lifetime.Giving,
data = data_train)
mod3lm <- lm( Lifetime.Giving ~ region,
data = data_train)
summary(mod1lm)
Call:
lm(formula = Lifetime.Giving ~ Married_simple, data = data_train)
Residuals:
Min 1Q Median 3Q Max
-3245 -3119 -2354 -2354 15826594
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2354.1 235.2 10.010 <0.0000000000000002 ***
Married_simple 890.5 438.8 2.029 0.0424 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 97730 on 242248 degrees of freedom
Multiple R-squared: 1.7e-05, Adjusted R-squared: 1.287e-05
F-statistic: 4.118 on 1 and 242248 DF, p-value: 0.04244
summary(mod2lm)
Call:
lm(formula = Total.Giving.Years ~ Lifetime.Giving, data = data_train)
Residuals:
Min 1Q Median 3Q Max
-40.329 -0.552 -0.552 -0.552 39.403
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.55182714180 0.00394457013 139.90 <0.0000000000000002 ***
Lifetime.Giving 0.00000363288 0.00000004035 90.04 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.941 on 242248 degrees of freedom
Multiple R-squared: 0.03238, Adjusted R-squared: 0.03238
F-statistic: 8106 on 1 and 242248 DF, p-value: < 0.00000000000000022
summary(mod3lm)
Call:
lm(formula = Lifetime.Giving ~ region, data = data_train)
Residuals:
Min 1Q Median 3Q Max
-3676 -3676 -3676 -3102 15074639
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 541.85 837.97 0.647 0.517876
regionDenver -380.47 2425.77 -0.157 0.875367
regionNew York 1882.25 1906.28 0.987 0.323450
regionNor Cal 2560.56 1436.20 1.783 0.074610 .
regionPortland -248.66 2366.39 -0.105 0.916311
regionSeattle 37.53 1843.88 0.020 0.983763
regionSo Cal 3134.07 895.59 3.499 0.000466 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 104300 on 144506 degrees of freedom
(97737 observations deleted due to missingness)
Multiple R-squared: 0.0001275, Adjusted R-squared: 8.596e-05
F-statistic: 3.071 on 6 and 144506 DF, p-value: 0.005258
ggplot(data = data_train, aes(x = Age, y = log(HH.Lifetime.Giving))) + geom_point(alpha = 1/10) + geom_smooth(method = lm) + facet_wrap(~region) + theme_clean(base_size = 8) + labs(x = "X", y = "Y") +
ggtitle("Region")
`geom_smooth()` using formula 'y ~ x'
ggplot(data = data_train, aes(x = Age, y = log(HH.Lifetime.Giving))) + geom_point(alpha = 1/10) + geom_smooth(method = lm) + facet_wrap(~nmb_degree) + theme_clean(base_size = 8) + labs(x = "X", y = "Y") +
ggtitle("Number of Degrees")
`geom_smooth()` using formula 'y ~ x'
ggplot(data = data_train, aes(x = Age, y = log(HH.First.Gift.Amount))) + geom_point(alpha = 1/10) + geom_smooth(method = lm) + facet_wrap(~donorseg_fct) + theme_clean(base_size = 8) + labs(x = "X", y = "Y") +
ggtitle("Donor Segment")
`geom_smooth()` using formula 'y ~ x'
#This plot actually has some interesting results
ggplot(data = data_train, aes(x = Age, y = log(Lifetime.Giving))) + geom_point(alpha = 1/10) + geom_smooth(method = lm) + facet_wrap(~No_of_Children) + theme_clean(base_size = 8) + labs(x = "X", y = "Y") +
ggtitle("# Children")
`geom_smooth()` using formula 'y ~ x'
data_train %>%
select_if(is.factor) %>%
glimpse()
Rows: 242,250
Columns: 57
$ major_gifter <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
$ sex_fct <fct> F, M, F, F, (Missing), M, F, M, M, F, (Missing), (Missing), (Missing), (Missing), M, (Mi~
$ sex_simple <fct> F, M, F, F, NA, M, F, M, M, F, NA, NA, NA, NA, M, NA, F, M, F, F, F, F, NA, M, NA, M, M,~
$ married_fct <fct> N, Y, N, N, N, N, Y, Y, Y, N, N, N, N, N, N, N, N, Y, N, N, Y, N, N, N, N, Y, N, N, Y, Y~
$ donorseg_fct <fct> Lost Donor, Lapsed Donor, (Missing), (Missing), (Missing), (Missing), Current Donor, (Mi~
$ donorseg_simple <fct> Lost Donor, Lapsed Donor, NA, NA, NA, NA, Current Donor, NA, Lapsing Donor, Lost Donor, ~
$ contact_fct <fct> (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), ~
$ contact_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, No Phonathon Calls, NA, ~
$ spomail_fct <fct> (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), ~
$ spomail_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, No Phona~
$ jobtitle_fct <fct> (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), President, ~
$ jobtitle_simple <fct> NA, NA, NA, NA, NA, NA, NA, President, Other, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
$ deg1_fct <fct> (Missing), (Missing), (Missing), Bachelor of Arts, (Missing), (Missing), Bachelor of Art~
$ deg1_simple <fct> NA, NA, NA, Bachelor of Arts, NA, NA, Bachelor of Arts, Other, NA, Bachelor of Arts, NA,~
$ deg2_fct <fct> (Missing), (Missing), (Missing), Master of Arts, (Missing), (Missing), (Missing), (Missi~
$ deg2_simple <fct> NA, NA, NA, Master of Arts, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, Master of Ar~
$ maj1_fct <fct> (Missing), (Missing), (Missing), Integrated Educ Studies BA, (Missing), (Missing), Socia~
$ maj1_simple <fct> NA, NA, NA, Other, NA, NA, Other, Other, NA, Other, NA, NA, NA, NA, NA, NA, Other, Other~
$ maj2_fct <fct> (Missing), (Missing), (Missing), Special Education MA, (Missing), (Missing), (Missing), ~
$ maj2_simple <fct> NA, NA, NA, Other, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, Other, NA, NA, NA, NA~
$ min1_fct <fct> (Missing), (Missing), (Missing), Psychology min, (Missing), (Missing), (Missing), (Missi~
$ min1_simple <fct> NA, NA, NA, Psychology min, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, Other, NA, N~
$ min2_fct <fct> (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), ~
$ min2_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ school1_fct <fct> (Missing), (Missing), (Missing), Donna Ford Attallah College of Educational Studies, (Mi~
$ school1_simple <fct> NA, NA, NA, Donna Ford Attallah College of Educational Studies, NA, NA, NA, George L. Ar~
$ school2_fct <fct> (Missing), (Missing), (Missing), Donna Ford Attallah College of Educational Studies, (Mi~
$ school2_simple <fct> NA, NA, NA, Donna Ford Attallah College of Educational Studies, NA, NA, NA, NA, NA, NA, ~
$ insttype_fct <fct> (Missing), (Missing), Undergraduate Degree | Undergraduate Degree | Undergraduate Degree~
$ insttype_simple <fct> NA, NA, Undergraduate Degree | Undergraduate Degree | Undergraduate Degree, Other, NA, U~
$ extra_fct <fct> (Missing), (Missing), (Missing), Delta Delta Delta|Snow Club|Snow Club|Snow Club|Snow Cl~
$ extra_simple <fct> NA, NA, NA, Other, NA, NA, Chapman Choir Tour, NA, NA, Other, NA, NA, NA, NA, NA, NA, Ch~
$ hhfirstgift_fct <fct> (Missing), Chapman Celebrates, (Missing), (Missing), (Missing), (Missing), (Missing), (M~
$ hhfirstgift_simple <fct> NA, Other, NA, NA, NA, NA, NA, NA, NA, Other, NA, NA, NA, NA, NA, NA, NA, NA, NA, Pre-SR~
$ ch1_enroll_fct <fct> (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), ~
$ ch1_enroll_simple <fct> NA, Program Completed, NA, NA, NA, NA, NA, Program Active, Plan Change, NA, NA, NA, NA, ~
$ ch1_maj_fct <fct> Business Administration BS, Dance BA|Integrated Educ Studies BA, (Missing), (Missing), (~
$ ch1_maj_simple <fct> Business Administration BS, Other, NA, NA, NA, NA, NA, Other, Other, NA, NA, NA, NA, NA,~
$ ch1_min_fct <fct> (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), Advertising~
$ ch1_min_simple <fct> NA, NA, NA, NA, NA, NA, NA, Other, Other, NA, NA, NA, NA, NA, English min, NA, NA, Other~
$ ch1_school_fct <fct> George L. Argyros School of Business and Economics, Donna Ford Attallah College of Educa~
$ ch1_school_simple <fct> George L. Argyros School of Business and Economics, Other, NA, NA, NA, NA, NA, Wilkinson~
$ ch1_feeder_fct <fct> (Missing), Chaparral High School, (Missing), (Missing), (Missing), (Missing), (Missing),~
$ ch1_feeder_simple <fct> NA, Other, NA, NA, NA, NA, NA, Other, Other, NA, NA, NA, NA, NA, Other, NA, NA, NA, Othe~
$ ch2_enroll_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ ch2_maj_fct <fct> (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), ~
$ ch2_maj_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ ch2_min_fct <fct> (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), ~
$ ch2_min_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ ch2_school_fct <fct> (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), ~
$ ch2_school_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ ch2_feeder_fct <fct> (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), (Missing), ~
$ ch2_feeder_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ region_fct <fct> (Missing), So Cal, (Missing), So Cal, (Missing), (Missing), Nor Cal, Seattle, (Missing),~
$ region_simple <fct> NA, So Cal, NA, So Cal, NA, NA, Nor Cal, Seattle, NA, NA, NA, NA, NA, NA, So Cal, NA, NA~
$ age_range_fct <fct> (Missing), (Missing), 20 < 30 years old, 30 < 40 years old, (Missing), 20 < 30 years old~
$ hh.lifetime.giving_fct <fct> 2125, 1445, 0, 0, 0, 0, 1267, 0, 750, 125, 0, 0, 0, 0, 0, 0, 0, 60, 0, 175, 0, 0, 0, 0, ~
Initial Logistic models
#removing since we didn't use it 12/7
# Set family to binomial to set logistic function
# Run the model on the training set
# donor_logit1 <-
# glm(hh.lifetime.giving_fct ~ Married_simple,
# family = "binomial",
# data = data_train)
#
# summary(donor_logit1)
#
#
# donor_logit2 <-
# glm(hh.lifetime.giving_fct ~ No_of_Children,
# family = "binomial",
# data = data_train)
#
# summary(donor_logit2)
Large fully processed Logistic model
#summary(data_train$major_gifter)
#Assignment_flag taken out - may add back
donor_logit3 <-
glm(major_gifter ~ Married + No_of_Children + donorseg_simple + Total.Giving.Years + nmb_degree,
family = "binomial",
data = data_train)
summary(donor_logit3)
Call:
glm(formula = major_gifter ~ Married + No_of_Children + donorseg_simple +
Total.Giving.Years + nmb_degree, family = "binomial", data = data_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.0162 -0.1346 -0.1189 -0.0647 4.2151
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.623014 0.232406 -15.589 < 0.0000000000000002 ***
MarriedY -1.196678 0.088444 -13.530 < 0.0000000000000002 ***
No_of_Children 0.610558 0.060093 10.160 < 0.0000000000000002 ***
donorseg_simpleCurrent Donor -0.086079 0.238167 -0.361 0.7178
donorseg_simpleLapsed Donor -0.685781 0.244285 -2.807 0.0050 **
donorseg_simpleLapsing Donor -0.491440 0.259772 -1.892 0.0585 .
donorseg_simpleLost Donor -1.349829 0.232121 -5.815 0.00000000606 ***
Total.Giving.Years 0.206177 0.005601 36.809 < 0.0000000000000002 ***
nmb_degree -2.493578 0.146982 -16.965 < 0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 9606.7 on 68301 degrees of freedom
Residual deviance: 7052.9 on 68293 degrees of freedom
(173948 observations deleted due to missingness)
AIC: 7070.9
Number of Fisher Scoring iterations: 8
exp(donor_logit3$coefficients)
(Intercept) MarriedY No_of_Children donorseg_simpleCurrent Donor
0.02670207 0.30219634 1.84145937 0.91752216
donorseg_simpleLapsed Donor donorseg_simpleLapsing Donor donorseg_simpleLost Donor Total.Giving.Years
0.50369656 0.61174479 0.25928458 1.22897028
nmb_degree
0.08261383
#training predictions for in sample preds
preds_train <- predict(donor_logit3, newdata = data_train, type = "response")
#test predicts for OOS (out of sample)
preds_test <- predict(donor_logit3, newdata = data_test, type = "response")
head(preds_train)
236259 260845 287321 219737 27673 298107
0.015426706 0.009114444 NA NA NA NA
head(preds_test)
4 10 11 14 15 18
0.132366655 0.002564702 0.888420638 0.136046442 0.742195040 0.002087869
results_train <- data.frame(
`truth` = data_train %>% select(major_gifter) %>%
mutate(major_gifter = as.numeric(major_gifter)),
`Class1` = preds_train,
`type` = rep("train",length(preds_train))
)
results_test <- data.frame(
`truth` = data_test %>% select(major_gifter) %>%
mutate(major_gifter = as.numeric(major_gifter)),
`Class1` = preds_test,
`type` = rep("test",length(preds_test))
)
results <- bind_rows(results_train,results_test)
dim(results_train)
[1] 242250 3
dim(results_test)
[1] 80750 3
dim(results)
[1] 323000 3
p_plot <-
ggplot(results,
aes(m = Class1, d = major_gifter, color = type)) +
geom_roc(labelsize = 2.5,
#Took the labelsize down to avoid cutoff
cutoffs.at = c(0.7,0.5,0.3,0.1,0)) +
#We removed some of the cutoffs to avoid the mashup near the origin.
#Changed the theme to avoid cutoff plot values.
theme_classic(base_size = 14) +
labs(x = "False Positive Rate",
y = "True Positive Rate") +
ggtitle("ROC Plot: Training and Test")
print(p_plot)
p_train <-
ggplot(results_train,
aes(m = Class1, d = major_gifter, color = type)) +
geom_roc(labelsize = 3.5,
cutoffs.at = c(0.7,0.5,0.3,0.1,0)) +
theme_minimal(base_size = 16) +
labs(x = "False Positive Rate",
y = "True Positive Rate") +
ggtitle("ROC Plot: Training and Test")
p_test <-
ggplot(results_test,
aes(m = Class1, d = major_gifter, color = type)) +
geom_roc(labelsize = 3.5,
cutoffs.at = c(0.7,0.5,0.3,0.1,0)) +
theme_minimal(base_size = 16) +
labs(x = "False Positive Rate",
y = "True Positive Rate") +
ggtitle("ROC Plot: Training and Test")
Calculating AUC of logistic model 3
#summary(donor_logit3)
#coef(donor_logit3)
#Calculating AUC of both
print(calc_auc(p_train)$AUC)
[1] 0.8918463
print(calc_auc(p_test)$AUC)
[1] 0.8839279
Ridge model with more variables added
data_train %>% map(levels) %>% map(length)
$ID
[1] 0
$Birthdate
[1] 0
$HH.Total.Gifts.FY20.21
[1] 0
$HH.Total.Gifts.FY19.20
[1] 0
$HH.Total.Gifts.FY18.19
[1] 0
$HH.Total.Gifts.FY17.18
[1] 0
$HH.Total.Gifts.FY16.17
[1] 0
$Class.Year
[1] 0
$Category.Codes
[1] 0
$Category.Descriptions
[1] 0
$Spouse.Birthdate
[1] 0
$Spouse.Class.Year
[1] 0
$Sex
[1] 0
$Married
[1] 0
$City
[1] 0
$State
[1] 0
$Zip
[1] 0
$County
[1] 0
$Job.Title
[1] 0
$Degree.Type.1
[1] 0
$Degree.Type.2
[1] 0
$Conferral.Date.1
[1] 0
$Conferral.Date.2
[1] 0
$Major.1
[1] 0
$Major.2
[1] 0
$Minor.1
[1] 0
$Minor.2
[1] 0
$School.1
[1] 0
$School.2
[1] 0
$Institution.Type
[1] 0
$Athletics
[1] 0
$Extracurricular
[1] 0
$Child.1.ID
[1] 0
$Child.1.Class.Year
[1] 0
$Child.1.Enroll.Status
[1] 0
$Child.1.Major
[1] 0
$Child.1.Minor
[1] 0
$Child.1.School
[1] 0
$Child.1.Feeder.School
[1] 0
$Child.2.ID
[1] 0
$Child.2.Class.Year
[1] 0
$Child.2.Enroll.Status
[1] 0
$Child.2.Major
[1] 0
$Child.2.Minor
[1] 0
$Child.2.School
[1] 0
$Child.2.Feeder.School
[1] 0
$Last.Contact.By.Anyone
[1] 0
$Lifetime.Giving
[1] 0
$HH.Lifetime.Giving
[1] 0
$Total.Giving.Years
[1] 0
$Total.Giving.Fiscal.Years
[1] 0
$Max.Consec.Fiscal.Years
[1] 0
$HH.Life.Hard.Credit
[1] 0
$HH.Life.Soft.Credit
[1] 0
$Contact.Rules
[1] 0
$Spouse.Mail.Rules
[1] 0
$HH.First.Gift.Amount
[1] 0
$HH.First.Gift.Date
[1] 0
$HH.First.Gift.Fund
[1] 0
$LegacyLeader..compass.score.
[1] 0
$Months.Since.Last.Gift
[1] 0
$Donor.Segment
[1] 0
$Compass.Score
[1] 0
$Scholarship
[1] 0
$Age
[1] 0
$Spouse.Age
[1] 0
$Assignment_flag
[1] 0
$No_of_Children
[1] 0
$nmb_degree
[1] 0
$Conferral.Date.1.Age
[1] 0
$Conferral.Date.2.Age
[1] 0
$Last.Contact.Age
[1] 0
$HH.First.Gift.Age
[1] 0
$major_gifter
[1] 2
$age_range
[1] 0
$region
[1] 0
$zipcode_slry
[1] 0
$zipslry_range
[1] 0
$sex_fct
[1] 5
$sex_simple
[1] 4
$married_fct
[1] 2
$donorseg_fct
[1] 6
$donorseg_simple
[1] 5
$contact_fct
[1] 166
$contact_simple
[1] 5
$spomail_fct
[1] 111
$spomail_simple
[1] 5
$jobtitle_fct
[1] 12438
$jobtitle_simple
[1] 6
$deg1_fct
[1] 53
$deg1_simple
[1] 6
$deg2_fct
[1] 43
$deg2_simple
[1] 6
$maj1_fct
[1] 1276
$maj1_simple
[1] 6
$maj2_fct
[1] 364
$maj2_simple
[1] 6
$min1_fct
[1] 930
$min1_simple
[1] 6
$min2_fct
[1] 52
$min2_simple
[1] 7
$school1_fct
[1] 17
$school1_simple
[1] 6
$school2_fct
[1] 17
$school2_simple
[1] 6
$insttype_fct
[1] 3293
$insttype_simple
[1] 6
$extra_fct
[1] 8591
$extra_simple
[1] 6
$hhfirstgift_fct
[1] 818
$hhfirstgift_simple
[1] 6
$ch1_enroll_fct
[1] 8
$ch1_enroll_simple
[1] 5
$ch1_maj_fct
[1] 1309
$ch1_maj_simple
[1] 5
$ch1_min_fct
[1] 848
$ch1_min_simple
[1] 5
$ch1_school_fct
[1] 21
$ch1_school_simple
[1] 5
$ch1_feeder_fct
[1] 3013
$ch1_feeder_simple
[1] 5
$ch2_enroll_simple
[1] 5
$ch2_maj_fct
[1] 283
$ch2_maj_simple
[1] 5
$ch2_min_fct
[1] 146
$ch2_min_simple
[1] 5
$ch2_school_fct
[1] 19
$ch2_school_simple
[1] 5
$ch2_feeder_fct
[1] 331
$ch2_feeder_simple
[1] 5
$region_fct
[1] 8
$region_simple
[1] 6
$age_range_fct
[1] 10
$Married_simple
[1] 0
$hh.lifetime.giving_fct
[1] 7067
$HH.Lifetime.Giving.Plus
[1] 0
$Lifetime.Giving.Plus
[1] 0
$average_yearly_donation
[1] 0
ridge_fit2 <- cv.glmnet(HH.Lifetime.Giving.Plus ~ sex_fct + Age + school1_simple + insttype_simple + extra_simple + Married + donorseg_simple + nmb_degree + No_of_Children + jobtitle_simple,
data = data_train,
alpha = 0)
#Alpha 0 sets the Ridge
print(ridge_fit2)
Call:
cv.glmnet.formula(formula = HH.Lifetime.Giving.Plus ~ sex_fct +
Age + school1_simple + insttype_simple + extra_simple + Married +
donorseg_simple + nmb_degree + No_of_Children + jobtitle_simple,
data = data_train, alpha = 0)
Model fitting options:
Sparse model matrix: FALSE
Use model.frame: FALSE
Number of crossvalidation folds: 10
Alpha: 0
Deviance-minimizing lambda: 0.1005957 (+1 SE): 1.240191
print(ridge_fit2$lambda.min)
[1] 0.1005957
print(ridge_fit2$lambda.1se)
[1] 1.240191
plot(ridge_fit2)
ridge_coefs <- data.frame(
'ridge_min' = coef(ridge_fit2, s = ridge_fit2$lambda.min) %>% round(3) %>% as.matrix() %>% as.data.frame(),
'ridge_1se' = coef(ridge_fit2, s = ridge_fit2$lambda.1se) %>% round(3) %>% as.matrix() %>% as.data.frame()
) %>% rename('ridge_min' = 1, 'ridge_1se' = 2)
ridge_coefs
NA
Lasso model 1
#Using cv.glmnet from class
#ls(data_train)
#is.factor(data_train$major_gifter)
#glimpse(data_train$Lifetime.Giving)
#data_train %>%
# select_if(is.factor) %>%
# glimpse()
# library(glmnet)
# library(glmnetUtils)
# lasso_fit <- cv.glmnet(HH.Lifetime.Giving.Plus ~ sex_fct + jobtitle_simple + nmb_degree + school1_simple + hhfirstgift_simple + maj1_simple + donorseg_simple + No_of_Children + Married,
# data = data_train,
# #Alpha 1 for lasso
# alpha = 1)
# print(lasso_fit$lambda.min)
# #
# print(lasso_fit$lambda.1se)
#
# plot(lasso_fit)
Lasso model 2
lasso_fit2 <- cv.glmnet(HH.Lifetime.Giving.Plus ~ sex_fct + Age + school1_simple + insttype_simple + extra_simple + Married + donorseg_simple + nmb_degree + No_of_Children + jobtitle_simple,
data = data_train,
#Alpha 1 for lasso
alpha = 1)
print(lasso_fit2$lambda.min)
[1] 0.02218267
#
print(lasso_fit2$lambda.1se)
[1] 0.1564943
plot(lasso_fit2)
Checking Lasso model 2 for coeff shrink
coef(lasso_fit2)
40 x 1 sparse Matrix of class "dgCMatrix"
s1
(Intercept) 2.977390190336220321399
sex_fctF -0.063295421071888027797
sex_fctM 0.000000000000006610965
sex_fctU .
sex_fctX .
sex_fct(Missing) .
Age 0.047469162883246249218
school1_simpleCollege of Health and Behavioral Sciences .
school1_simpleDonna Ford Attallah College of Educational Studies .
school1_simpleGeorge L. Argyros School of Business and Economics 0.034787105908194705661
school1_simpleLawrence and Kristina Dodge Coll of Film & Media .
school1_simpleWilkinson Coll of Arts Humanities & Soc Sciences .
school1_simpleOther .
insttype_simpleGraduate Degree .
insttype_simpleLaw JD Full-Time Program .
insttype_simpleUndergraduate Degree .
insttype_simpleUndergraduate Degree | Undergraduate Degree .
insttype_simpleUndergraduate Degree | Undergraduate Degree | Undergraduate Degree .
insttype_simpleOther 0.062854709638579198128
extra_simpleChapman Choir Tour .
extra_simpleDisciples on Campus .
extra_simpleFootball .
extra_simpleInternational Student .
extra_simpleWorld Campus Afloat/Sem at Sea .
extra_simpleOther .
MarriedN -0.370468673088656275105
MarriedY 0.000809164203377083099
donorseg_simpleAt-Risk Donor .
donorseg_simpleCurrent Donor 1.775624909368799908549
donorseg_simpleLapsed Donor .
donorseg_simpleLapsing Donor 0.304058775117803548049
donorseg_simpleLost Donor -0.330156786744252639387
nmb_degree .
No_of_Children .
jobtitle_simpleAttorney .
jobtitle_simpleOwner .
jobtitle_simplePresident .
jobtitle_simpleTeacher .
jobtitle_simpleUnknown Position .
jobtitle_simpleOther .
#Default setting is lambda.1se
#From the book - showing convergence with lambda values
plot(lasso_fit2$glmnet.fit, xvar="lambda")
#abline(v=log(c(lasso_fit$lambda.min, lasso_fit$lambda.1se)), lty=2)
Elasticnet model part 1
enet_mod <- cva.glmnet(HH.Lifetime.Giving.Plus ~ sex_fct + Age + school1_simple + insttype_simple + extra_simple + Married + donorseg_simple + nmb_degree + No_of_Children + jobtitle_simple,
data = data_train,
alpha = seq(0,1, by = 0.1))
print(enet_mod)
Call:
cva.glmnet.formula(formula = HH.Lifetime.Giving.Plus ~ sex_fct +
Age + school1_simple + insttype_simple + extra_simple + Married +
donorseg_simple + nmb_degree + No_of_Children + jobtitle_simple,
data = data_train, alpha = seq(0, 1, by = 0.1))
Model fitting options:
Sparse model matrix: FALSE
Use model.frame: FALSE
Alpha values: 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1
Number of crossvalidation folds for lambda: 10
plot(enet_mod)
NA
NA
Elasticnet model part 2
minlossplot(enet_mod,
cv.type = "min")
get_alpha <- function(fit) {
alpha <- fit$alpha
error <- sapply(fit$modlist,
function(mod) {min(mod$cvm)})
alpha[which.min(error)]
}
get_model_params <- function(fit) {
alpha <- fit$alpha
lambdaMin <- sapply(fit$modlist, `[[`, "lambda.min")
lambdaSE <- sapply(fit$modlist, `[[`, "lambda.1se")
error <- sapply(fit$modlist, function(mod) {min(mod$cvm)})
best <- which.min(error)
data.frame(alpha = alpha[best], lambdaMin = lambdaMin[best],
lambdaSE = lambdaSE[best], eror = error[best])
}
best_alpha <- get_alpha(enet_mod)
print(best_alpha)
[1] 0.4
get_model_params(enet_mod)
best_mod <- enet_mod$modlist[[which(enet_mod$alpha == best_alpha)]]
print(best_mod)
Call: glmnet::cv.glmnet(x = x, y = y, weights = ..1, offset = ..2, nfolds = nfolds, foldid = foldid, alpha = a)
Measure: Mean-Squared Error
Lambda Index Measure SE Nonzero
min 0.04195 42 2.314 0.07425 25
1se 0.24571 23
minlossplot(enet_mod, cv.type = "min")
Ridges plots - could be useful for plotting donations vs donor segment
library('ggridges')
summary(data_train$variable)
Length Class Mode
0 NULL NULL
ggplot(data_train, aes(x = HH.Lifetime.Giving, y = region)) + geom_density_ridges(rel_min_height = 0.005) + xlim(c(25000, 100000)) +
ggtitle("HH Lifetime Giving by Region")
Picking joint bandwidth of 8190
#removing ID zip and nonnumeric
corrplot_data <- dataclean[-c(1:48,52:56,58:60,63,66:67,70:72,74:81,83:138)]
#Convert from character to numeric data type
convert_fac2num <- function(x){
as.numeric(as.factor(x))
}
corrplot_data <- mutate_at(corrplot_data,
.vars = c(1:12),
.funs = convert_fac2num)
#making a matrix
#creating correlation
col <- colorRampPalette(c("#BB4400", "#EE9990",
"#FFFFFF", "#77AAEE", "#4477BB"))
corrplot(cd_cor, method="color", col=col(100),
type="lower", addCoef.col = "black",
tl.pos="lt", tl.col="black",
tl.cex=0.7, tl.srt=45,
number.cex=0.7,
diag=FALSE)
#correlation matrix
# pairs(~Age + Months.Since.Last.Gift + donorseg_fct +
# nmb_degree + No_of_Children + HH.First.Gift.Age + HH.First.Gift.Amount + Total.Giving.Years,
# col = corrplot_data$HH.Lifetime.Giving,
# data = corrplot_data,
# main = "Donor Scatter Plot Matrix")
#worthless..
ggplot(data = corrplot_data, aes(x = nmb_degree, y = HH.Lifetime.Giving)) +
geom_point(aplha = 1/10)+
geom_smooth(method = "lm", color ="red")
`geom_smooth()` using formula 'y ~ x'
Random Forest
training <- subset(dataclean, select= c(major_gifter,Age,sex_fct,Lifetime.Giving.Plus,jobtitle_simple
,No_of_Children,region_fct,nmb_degree,Assignment_flag
,donorseg_simple,Months.Since.Last.Gift,Last.Contact.Age
,age_range_fct,school1_simple,insttype_simple,HH.First.Gift.Amount,Total.Giving.Years
,average_yearly_donation))
rf_fit_donor <- randomForest(Lifetime.Giving.Plus ~ .,
data = training,
type = classification,
mtry = 5,
na.action = na.roughfix,
ntree = 50,
importance=TRUE
)
print(rf_fit_donor)
Call:
randomForest(formula = Lifetime.Giving.Plus ~ ., data = training, type = classification, mtry = 5, ntree = 50, importance = TRUE, na.action = na.roughfix)
Type of random forest: regression
Number of trees: 50
No. of variables tried at each split: 5
Mean of squared residuals: 0.1117669
% Var explained: 97.87
plot(rf_fit_donor)
varImpPlot(rf_fit_donor, sort = TRUE,
n.var = 5,
type = 2, class = NULL, scale = TRUE,
main = deparse(substitute(rf_fit_donor)))
varImpPlot(rf_fit_donor,type = 1,scale = FALSE,n.var = 5,sort = TRUE)
#Plotting Depth of Node usage
# plot_min_depth_distribution(
# rf_fit_donor,
# k = 10,
# min_no_of_trees = 0,
# mean_sample = "top_trees",
# mean_scale = FALSE,
# mean_round = 2,
# main = "Distribution of minimal depth and its mean"
# )
plot_min_depth_distribution(rf_fit_donor)
#Splitting Category out to check if the category is useful for analysis
#data_category_split_out <- dataclean %>%
# mutate(Category.Codes = trim(strsplit(as.character(Category.Codes), "|", fixed = TRUE))) %>%
# unnest(Category.Codes) %>% pivot_wider(names_from = Category.Codes,values_from =Category.Codes, values_fn = length)
#We ran analysis and did not find the data useful